import scrapy
from scrapy import cmdline
from scrapy.http import HtmlResponse

class Top250Spider(scrapy.Spider):
    """Spider that scrapes the Douban Top 250 movie list pages.

    For each movie entry it extracts the cover image URL, the primary
    title, the rating score and the vote-count text, then prints them
    to stdout for quick visual verification.
    """

    name = "top250"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response: HtmlResponse, **kwargs):
        """Extract movie fields from one Top 250 listing page."""
        # Print the outgoing request headers so the UA override
        # configured in settings can be confirmed by eye.
        print('请求头:', response.request.headers)
        for movie in response.xpath("//ol[@class='grid_view']/li"):
            cover = movie.xpath(".//img/@src").get()
            # [1] keeps only the first <span class="title"> (the
            # Chinese title); the second span holds the original title.
            title = movie.xpath(".//span[@class='title'][1]/text()").get()
            score = movie.xpath(".//span[@class='rating_num']/text()").get()
            votes = movie.xpath(".//div[@class='star']/span[4]/text()").get()

            # Sanity-check the extracted values.
            print('--->', cover, title, score, votes)


if __name__ == '__main__':
    # Launch this spider through the Scrapy CLI when run as a script.
    cmdline.execute(["scrapy", "crawl", "top250"])


"""
1.将rebots协议关闭, 在settings文件中进行关闭: 在配置文件中的第19行
2.添加了UA头:在settings文件中配置: 在配置文件中的第39行
3.检查start_urls中的域名是否正确, 不正确则手动更改
4.数据解析验证
"""